load packages
library(tidyverse)
library(glmnet)
library(caTools)
library(caret)
library(ROCR)
load coded data
coded = read.csv("~/Documents/code/dsnlab/automotion-test-set/FP_manuallyCoded_edited2.csv") %>%
select(subjectID, run, volume, trash) %>%
rename("artifact" = trash)
load stripe detection data
fileDir = "~/Documents/code/dsnlab/automotion-test-set/output/FP/"
filePattern = "FP_stripes_.*.csv"
file_list = list.files(fileDir, pattern = filePattern)
for (file in file_list){
# if the merged dataset doesn't exist, create it
if (!exists("stripes")){
temp = read.csv(paste0(fileDir,file))
stripes = data.frame(temp) %>%
rename("volume" = t,
"subjectID" = subject) %>%
select(-file)
rm(temp)
}
# if the merged dataset does exist, append to it
else {
temp_dataset = read.csv(paste0(fileDir,file))
temp_dataset = data.frame(temp_dataset) %>%
rename("volume" = t,
"subjectID" = subject) %>%
select(-file)
stripes = rbind(stripes, temp_dataset)
rm(temp_dataset)
}
}
load global intensities and rps
# define paths and variables
rpDir = '~/Documents/code/dsnlab/automotion-test-set/FP_rp_txt/'
# outputDir = '~/Documents/code/tds_auto-motion/auto-motion-output/'
# plotDir = '~/Documents/code/tds_auto-motion/auto-motion-output/plots/'
study = "FP"
rpPattern = '^rp_(FP[0-9]{3})_(.*).txt'
rpCols = c("euclidian_trans","euclidian_rot","euclidian_trans_deriv","euclidian_rot_deriv","trash.rp")
# global intensities
intensities = read.csv(paste0('~/Documents/code/dsnlab/automotion-test-set/',study,'_globalIntensities.csv'))
# rp files
file_list = list.files(rpDir, pattern = rpPattern)
for (file in file_list){
# if the merged dataset doesn't exist, create it
if (!exists("rp")){
temp = read.table(paste0(rpDir,file))
colnames(temp) = rpCols
rp = data.frame(temp, file = rep(file,count(temp))) %>%
mutate(volume = row_number()) %>%
extract(file,c("subjectID","run"), rpPattern)
rm(temp)
}
# if the merged dataset does exist, append to it
else {
temp_dataset = read.table(paste0(rpDir,file))
colnames(temp_dataset) = rpCols
temp_dataset = data.frame(temp_dataset, file = rep(file,count(temp_dataset))) %>%
mutate(volume = row_number()) %>%
extract(file,c("subjectID","run"), rpPattern)
rp = rbind(rp, temp_dataset)
rm(temp_dataset)
}
}
join dataframes
joined = left_join(stripes, coded, by = c("subjectID", "run", "volume")) %>%
left_join(., intensities, by = c("subjectID", "run", "volume")) %>%
left_join(., rp, by = c("subjectID", "run", "volume")) %>%
mutate(tile = paste0("tile_",tile)) %>%
group_by(subjectID, run, tile) %>%
mutate(Diff.mean = volMean - lag(volMean),
Diff.sd = volSD - lag(volSD)) %>%
spread(tile, freqtile_power)
split the data
set.seed(101)
sample = sample.split(joined$artifact, SplitRatio = .75)
training = subset(joined, sample == TRUE)
testing = subset(joined, sample == FALSE)
machine learning
tidy training and testing data
train.ml = training %>%
group_by(subjectID, run) %>%
mutate(Diff.mean = ifelse(is.na(Diff.mean),0,Diff.mean),
Diff.sd = ifelse(is.na(Diff.sd),0,Diff.sd)) %>%
gather(tile,freqtile_power, starts_with("tile")) %>%
mutate(tile = paste0(tile,"_c")) %>%
group_by(subjectID, run, tile) %>%
mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
ungroup() %>%
select(-freqtile_power) %>%
spread(tile,freqtile_power_c) %>%
select(-trash.rp, -volMean, -volSD, -euclidian_rot, -euclidian_trans) %>%
select(subjectID, run, volume, artifact, everything())
test.ml = testing %>%
group_by(subjectID, run) %>%
mutate(Diff.mean = ifelse(is.na(Diff.mean),0,Diff.mean),
Diff.sd = ifelse(is.na(Diff.sd),0,Diff.sd)) %>%
gather(tile,freqtile_power, starts_with("tile")) %>%
mutate(tile = paste0(tile,"_c")) %>%
group_by(subjectID, run, tile) %>%
mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
ungroup() %>%
select(-freqtile_power) %>%
spread(tile,freqtile_power_c) %>%
select(-trash.rp, -volMean, -volSD, -euclidian_rot, -euclidian_trans) %>%
select(subjectID, run, volume, artifact, everything())
use lasso logistic regression to fit beta weights for each predictor
training
# subset predictors and criterion
x_train = as.matrix(train.ml[,-c(1,2,3,4)])
y_train = as.double(as.matrix(train.ml[, 4]))
# run xval to determine lambda
cv.train <- cv.glmnet(x_train, y_train, family='binomial', alpha=1, parallel=TRUE, standardize=TRUE, type.measure='auc')
plot(cv.train)

plot(cv.train$glmnet.fit, xvar="lambda", label=TRUE)

cv.train$lambda.min
## [1] 0.001160478
cv.train$lambda.1se
## [1] 0.00564294
coef(cv.train, s=cv.train$lambda.min)
## 16 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.52394327
## euclidian_trans_deriv .
## euclidian_rot_deriv 1.32222698
## Diff.mean 0.01993830
## Diff.sd -0.03976028
## tile_1_c -33.99427335
## tile_10_c 5652.90255527
## tile_11_c .
## tile_2_c -10.78123078
## tile_3_c -1156.19857225
## tile_4_c .
## tile_5_c 61.23874115
## tile_6_c -431.10703537
## tile_7_c .
## tile_8_c 3396.60610294
## tile_9_c .
coef(cv.train, s=cv.train$lambda.1se)
## 16 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.130248103
## euclidian_trans_deriv .
## euclidian_rot_deriv .
## Diff.mean 0.008482909
## Diff.sd -0.012461183
## tile_1_c -25.707434181
## tile_10_c 3004.758521999
## tile_11_c .
## tile_2_c .
## tile_3_c .
## tile_4_c .
## tile_5_c .
## tile_6_c .
## tile_7_c .
## tile_8_c 1860.356734575
## tile_9_c .
# test on sample
pred_train = predict(cv.train, newx = x_train, s=cv.train$lambda.1se, type="response")
# plot cutoff v. accuracy
predicted = prediction(pred_train, y_train, label.ordering = NULL)
perf = performance(predicted, measure = "acc")
perf.df = data.frame(cut=perf@x.values[[1]],acc=perf@y.values[[1]])
ggplot(perf.df, aes(cut, acc)) +
geom_line()

# plot false v. true positive rate
perf = performance(predicted, measure = "tpr", x.measure = "fpr")
perf.df = data.frame(cut=perf@alpha.values[[1]],fpr=perf@x.values[[1]],tpr=perf@y.values[[1]])
ggplot(perf.df, aes(fpr, tpr)) +
geom_line()

# plot specificity v. sensitivity
perf = performance(predicted, measure = "sens", x.measure = "spec")
perf.df = data.frame(cut=perf@alpha.values[[1]],sens=perf@x.values[[1]],spec=perf@y.values[[1]])
ggplot(perf.df, aes(spec, sens)) +
geom_line()

ggplot(perf.df, aes(x = cut)) +
geom_line(aes(y = sens, color = "sensitivity")) +
geom_line(aes(y = spec, color = "specificity"))

perf@alpha.values[[1]][which.max(perf@x.values[[1]]+perf@y.values[[1]])] # cut
## [1] 0.02925047
max(perf@x.values[[1]]+perf@y.values[[1]]) # sensitivity + specificity
## [1] 1.687121
# confusion matrix
pred_train = predict(cv.train, newx = x_train, s=cv.train$lambda.1se, type="response")
pred_train[pred_train > .05] = 1
pred_train[pred_train < .05] = 0
confusionMatrix(pred_train, y_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16387 154
## 1 283 261
##
## Accuracy : 0.9744
## 95% CI : (0.9719, 0.9767)
## No Information Rate : 0.9757
## P-Value [Acc > NIR] : 0.8679
##
## Kappa : 0.5314
## Mcnemar's Test P-Value : 0.0000000009179
##
## Sensitivity : 0.9830
## Specificity : 0.6289
## Pos Pred Value : 0.9907
## Neg Pred Value : 0.4798
## Prevalence : 0.9757
## Detection Rate : 0.9591
## Detection Prevalence : 0.9682
## Balanced Accuracy : 0.8060
##
## 'Positive' Class : 0
##
testing
# subset predictors and criterion
x_test = as.matrix(test.ml[,-c(1,2,3,4)])
y_test = as.double(as.matrix(test.ml[, 4]))
# test on holdout sample
pred_test = predict(cv.train, newx = x_test, s=cv.train$lambda.1se, type="response")
pred_test[pred_test > .05] = 1
pred_test[pred_test < .05] = 0
# confusion matrix
confusionMatrix(pred_test, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5470 47
## 1 87 91
##
## Accuracy : 0.9765
## 95% CI : (0.9722, 0.9802)
## No Information Rate : 0.9758
## P-Value [Acc > NIR] : 0.3862399
##
## Kappa : 0.564
## Mcnemar's Test P-Value : 0.0007542
##
## Sensitivity : 0.9843
## Specificity : 0.6594
## Pos Pred Value : 0.9915
## Neg Pred Value : 0.5112
## Prevalence : 0.9758
## Detection Rate : 0.9605
## Detection Prevalence : 0.9687
## Balanced Accuracy : 0.8219
##
## 'Positive' Class : 0
##
svm
training
train.svm = train.ml[,-c(1,2,3)] %>%
mutate(artifact = ifelse(artifact == 1, "yes","no"),
artifact = as.factor(artifact))
# specify control parameters
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 3, classProbs = TRUE)
# run initial model
set.seed(101)
svmFit = train(artifact ~ .,
data = train.svm,
method = "svmLinear",
trControl = fitControl,
preProcess = c("center", "scale"),
tuneLength = 10,
metric = "ROC",
verbose = FALSE)
svmFit$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 619
##
## Objective Function Value : -609.3917
## Training error : 0.015335
## Probability model included.
# predict model
train_pred = predict(svmFit, newdata = train.svm, type="prob") %>%
select(-no)
# plot cutoff v. accuracy
predicted = prediction(train_pred, train.svm$artifact, label.ordering = NULL)
perf = performance(predicted, measure = "acc")
perf.df = data.frame(cut=perf@x.values[[1]],acc=perf@y.values[[1]])
ggplot(perf.df, aes(cut, acc)) +
geom_line()

# plot false v. true positive rate
perf = performance(predicted, measure = "tpr", x.measure = "fpr")
perf.df = data.frame(cut=perf@alpha.values[[1]],fpr=perf@x.values[[1]],tpr=perf@y.values[[1]])
ggplot(perf.df, aes(fpr, tpr)) +
geom_line()

# plot specificity v. sensitivity
perf = performance(predicted, measure = "sens", x.measure = "spec")
perf.df = data.frame(cut=perf@alpha.values[[1]],sens=perf@x.values[[1]],spec=perf@y.values[[1]])
ggplot(perf.df, aes(spec, sens)) +
geom_line()

ggplot(perf.df, aes(x = cut)) +
geom_line(aes(y = sens, color = "sensitivity")) +
geom_line(aes(y = spec, color = "specificity"))

perf@alpha.values[[1]][which.max(perf@x.values[[1]]+perf@y.values[[1]])] # cut
## [1] 0.0307682
max(perf@x.values[[1]]+perf@y.values[[1]]) # sensitivity + specificity
## [1] 1.693839
# cut and assess accuracy in training sample
train_pred = predict(svmFit, newdata = train.svm, type="prob") %>%
select(-no)
train_pred = as.matrix(train_pred)
train_pred[train_pred > .09] = "yes"
train_pred[train_pred < .09] = "no"
confusionMatrix(train_pred, train.svm$artifact)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 16577 172
## yes 93 243
##
## Accuracy : 0.9845
## 95% CI : (0.9825, 0.9863)
## No Information Rate : 0.9757
## P-Value [Acc > NIR] : 0.000000000000001021
##
## Kappa : 0.6393
## Mcnemar's Test P-Value : 0.000001655374433974
##
## Sensitivity : 0.9944
## Specificity : 0.5855
## Pos Pred Value : 0.9897
## Neg Pred Value : 0.7232
## Prevalence : 0.9757
## Detection Rate : 0.9703
## Detection Prevalence : 0.9803
## Balanced Accuracy : 0.7900
##
## 'Positive' Class : no
##
testing
test.svm = test.ml[,-c(1,2,3)] %>%
mutate(artifact = ifelse(artifact == 1, "yes","no"),
artifact = as.factor(artifact))
# cut and assess accuracy in test sample
test_pred = predict(svmFit, newdata = test.svm, type="prob") %>%
select(-no)
test_pred = as.matrix(test_pred)
test_pred[test_pred > .09] = "yes"
test_pred[test_pred < .09] = "no"
confusionMatrix(test_pred, test.svm$artifact)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 5524 44
## yes 33 94
##
## Accuracy : 0.9865
## 95% CI : (0.9831, 0.9893)
## No Information Rate : 0.9758
## P-Value [Acc > NIR] : 0.000000007494
##
## Kappa : 0.7025
## Mcnemar's Test P-Value : 0.2545
##
## Sensitivity : 0.9941
## Specificity : 0.6812
## Pos Pred Value : 0.9921
## Neg Pred Value : 0.7402
## Prevalence : 0.9758
## Detection Rate : 0.9700
## Detection Prevalence : 0.9777
## Balanced Accuracy : 0.8376
##
## 'Positive' Class : no
##
save models
setwd("~/Documents/code/dsnlab/automotion-test-set")
saveRDS(cv.train, "model_log_FP.rds")
saveRDS(svmFit, "model_svm_FP.rds")
weighted model
# create model weights (they sum to one)
# model_weights = ifelse(train.svm$artifact == "yes",
# (1/table(train.svm$artifact)[1]) * 0.5,
# (1/table(train.svm$artifact)[2]) * 0.5)
#
# # use the same seed to ensure same cross-validation splits
# fitControl$seeds = svmFit$control$seeds
#
# svmFit_weighted = train(artifact ~ .,
# data = train.svm,
# method = "svmLinear",
# trControl = fitControl,
# preProcess = c("center", "scale"),
# tuneLength = 10,
# metric = "ROC",
# verbose = FALSE,
# weights = model_weights)
#
# svmFit_weighted$finalModel
#
# test_pred_weighted = predict(svmFit_weighted, newdata = test.svm, type="prob") %>%
# select(-no)
# test_pred_weighted = as.matrix(test_pred_weighted)
# test_pred_weighted[test_pred_weighted > .07] = "yes"
# test_pred_weighted[test_pred_weighted < .07] = "no"
# confusionMatrix(test_pred_weighted, test.svm$artifact)
# # determine best cost parameter
# grid <- expand.grid(C = c(0, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 5))
# set.seed(3233)
# svm_Linear_Grid <- train(artifact ~ .,
# data = train.svm, method = "svmLinear",
# trControl=fitControl,
# preProcess = c("center", "scale"),
# tuneGrid = grid,
# tuneLength = 10)
# svm_Linear_Grid
# plot(svm_Linear_Grid)
# test_pred_grid = predict(svm_Linear_Grid, newdata = test.svm)
# confusionMatrix(test_pred_grid, test.svm$artifact)
auto-motion process
training
train.man = training %>%
gather(tile, freqtile_power, starts_with("tile")) %>%
filter(tile %in% c("tile_1", "tile_10")) %>%
# code trash based on mean, sd, and rp
ungroup %>%
mutate(meanDiff.mean = mean(Diff.mean, na.rm=TRUE),
sdDiff.mean = sd(Diff.mean, na.rm=TRUE),
meanDiff.sd = mean(Diff.sd, na.rm=TRUE),
sdDiff.sd = sd(Diff.sd, na.rm=TRUE),
# code volumes above mean thresholds as trash
upper.mean = meanDiff.mean + 2*sdDiff.mean,
lower.mean = meanDiff.mean - 2*sdDiff.mean,
trash.mean = ifelse(Diff.mean > upper.mean | Diff.mean < lower.mean, 1, 0),
upper.sd = meanDiff.sd + 2*sdDiff.sd,
lower.sd = meanDiff.sd - 2*sdDiff.sd,
trash.sd = ifelse(Diff.sd > upper.sd | Diff.sd < lower.sd, 1, 0),
# code volumes with more than +/- .25mm translation or rotation in Euclidian distance
trash.rp.tr = ifelse(euclidian_trans_deriv > .25 | euclidian_trans_deriv < -.25, 1, 0),
trash.rp.rot = ifelse(euclidian_rot_deriv > .25 | euclidian_rot_deriv < -.25, 1, 0)) %>%
select(-meanDiff.mean, -meanDiff.sd, -sdDiff.mean, -sdDiff.sd) %>%
# code trash based on striping
group_by(subjectID, run, tile) %>%
mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
ungroup() %>%
select(-freqtile_power) %>%
spread(tile,freqtile_power_c) %>%
mutate(trash.stripe = ifelse(tile_1 < -.035 & tile_10 > .00025, 1, 0)) %>%
# combine trash
mutate(trash.combined = ifelse(trash.stripe == 1, 1, 0),
trash.sum = trash.rp.tr + trash.rp.rot + trash.mean + trash.sd,
trash.combined = ifelse((trash.rp.tr + trash.rp.rot + trash.mean + trash.sd) > 1, 1, trash.combined)) %>%
# recode as trash if volume behind and in front are both marked as trash
mutate(trash.combined = ifelse(trash.combined == 0 & lag(trash.combined) == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
# code first volume as trash if second volume is trash
mutate(trash.combined = ifelse(volume == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
# code hits
mutate(hits = ifelse(trash.combined == 1 & (artifact == 1), "hit",
ifelse(trash.combined == 0 & (artifact == 1), "neg",
ifelse(trash.combined == 1 & (artifact == 0), "pos", NA))),
label = ifelse(regexpr('.*', hits), as.character(volume), ''),
hits = as.factor(hits)) %>%
gather(tile, freqtile_power_c, c("tile_1", "tile_10"))
testing
test.man = testing %>%
gather(tile, freqtile_power, starts_with("tile")) %>%
filter(tile %in% c("tile_1", "tile_10")) %>%
# code trash based on mean, sd, and rp
ungroup %>%
mutate(meanDiff.mean = mean(Diff.mean, na.rm=TRUE),
sdDiff.mean = sd(Diff.mean, na.rm=TRUE),
meanDiff.sd = mean(Diff.sd, na.rm=TRUE),
sdDiff.sd = sd(Diff.sd, na.rm=TRUE),
# code volumes above mean thresholds as trash
upper.mean = meanDiff.mean + 2*sdDiff.mean,
lower.mean = meanDiff.mean - 2*sdDiff.mean,
trash.mean = ifelse(Diff.mean > upper.mean | Diff.mean < lower.mean, 1, 0),
upper.sd = meanDiff.sd + 2*sdDiff.sd,
lower.sd = meanDiff.sd - 2*sdDiff.sd,
trash.sd = ifelse(Diff.sd > upper.sd | Diff.sd < lower.sd, 1, 0),
# code volumes with more than +/- .25mm translation or rotation in Euclidian distance
trash.rp.tr = ifelse(euclidian_trans_deriv > .25 | euclidian_trans_deriv < -.25, 1, 0),
trash.rp.rot = ifelse(euclidian_rot_deriv > .25 | euclidian_rot_deriv < -.25, 1, 0)) %>%
select(-meanDiff.mean, -meanDiff.sd, -sdDiff.mean, -sdDiff.sd) %>%
# code trash based on striping
group_by(subjectID, run, tile) %>%
mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
ungroup() %>%
select(-freqtile_power) %>%
spread(tile,freqtile_power_c) %>%
mutate(trash.stripe = ifelse(tile_1 < -.035 & tile_10 > .00025, 1, 0)) %>%
# combine trash
mutate(trash.combined = ifelse(trash.stripe == 1, 1, 0),
trash.sum = trash.rp.tr + trash.rp.rot + trash.mean + trash.sd,
trash.combined = ifelse((trash.rp.tr + trash.rp.rot + trash.mean + trash.sd) > 1, 1, trash.combined)) %>%
# recode as trash if volume behind and in front are both marked as trash
mutate(trash.combined = ifelse(trash.combined == 0 & lag(trash.combined) == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
# code first volume as trash if second volume is trash
mutate(trash.combined = ifelse(volume == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
# code hits
mutate(hits = ifelse(trash.combined == 1 & (artifact == 1), "hit",
ifelse(trash.combined == 0 & (artifact == 1), "neg",
ifelse(trash.combined == 1 & (artifact == 0), "pos", NA))),
label = ifelse(regexpr('.*', hits), as.character(volume), ''),
hits = as.factor(hits)) %>%
gather(tile, freqtile_power_c, c("tile_1", "tile_10"))
compare hit rates
training data
# select only one set of observations and code correct rejections
train.tab = train.man %>%
filter(tile == "tile_1") %>%
mutate(hits.tot = ifelse(hits %in% "hit", "hit",
ifelse(hits %in% "neg", "neg",
ifelse(hits %in% "pos", "pos",
ifelse(is.na(hits), "cor.rej", hits)))))
lasso logistic regression
confusionMatrix(pred_train, y_train)$table
## Reference
## Prediction 0 1
## 0 16387 154
## 1 283 261
confusionMatrix(pred_train, y_train)$overall[1]
## Accuracy
## 0.974422
confusionMatrix(pred_train, y_train)$byClass[11]
## Balanced Accuracy
## 0.8059695
svm
confusionMatrix(train_pred, train.svm$artifact)$table
## Reference
## Prediction no yes
## no 16577 172
## yes 93 243
confusionMatrix(train_pred, train.svm$artifact)$overall[1]
## Accuracy
## 0.9844893
confusionMatrix(train_pred, train.svm$artifact)$byClass[11]
## Balanced Accuracy
## 0.7899816
manual
table(train.tab$hits.tot)
##
## cor.rej hit neg pos
## 16614 266 148 57
cor.rej = table(train.tab$hits.tot)[[1]]
hit = table(train.tab$hits.tot)[[2]]
neg = table(train.tab$hits.tot)[[3]]
pos = table(train.tab$hits.tot)[[4]]
total = cor.rej + hit + neg + pos
sprintf('Accuracy: %s', round((cor.rej + hit) / total,2))
## [1] "Accuracy: 0.99"
# balanced accuracy
sprintf('Balanced accuracy: %s', round(((cor.rej / (cor.rej + pos)) + (hit / (hit + neg))) / 2,2))
## [1] "Balanced accuracy: 0.82"
test data
test.tab = test.man %>%
filter(tile == "tile_1") %>%
mutate(hits.tot = ifelse(hits %in% "hit", "hit",
ifelse(hits %in% "neg", "neg",
ifelse(hits %in% "pos", "pos",
ifelse(is.na(hits), "cor.rej", NA)))))
lasso logistic regression
confusionMatrix(pred_test, y_test)$table
## Reference
## Prediction 0 1
## 0 5470 47
## 1 87 91
confusionMatrix(pred_test, y_test)$overall[1]
## Accuracy
## 0.9764706
confusionMatrix(pred_test, y_test)$byClass[11]
## Balanced Accuracy
## 0.8218822
svm
confusionMatrix(test_pred, test.svm$artifact)$table
## Reference
## Prediction no yes
## no 5524 44
## yes 33 94
confusionMatrix(test_pred, test.svm$artifact)$overall[1]
## Accuracy
## 0.9864794
confusionMatrix(test_pred, test.svm$artifact)$byClass[11]
## Balanced Accuracy
## 0.8376105
manual
table(test.tab$hits.tot)
##
## cor.rej hit neg pos
## 5532 93 45 25
cor.rej = table(test.tab$hits.tot)[[1]]
hit = table(test.tab$hits.tot)[[2]]
neg = table(test.tab$hits.tot)[[3]]
pos = table(test.tab$hits.tot)[[4]]
total = cor.rej + hit + neg + pos
sprintf('Accuracy: %s', round((cor.rej + hit) / total,2))
## [1] "Accuracy: 0.99"
# balanced accuracy
sprintf('Balanced accuracy: %s', round(((cor.rej / (cor.rej + pos)) + (hit / (hit + neg))) / 2,2))
## [1] "Balanced accuracy: 0.83"
plot composite
combine training and test data
full_man = bind_rows(train.man, test.man)
logistic regression
full_log = bind_rows(train.ml,test.ml)
# re-run lasso logistic regression on full sample
x = as.matrix(full_log[,-c(1,2,3,4)])
y = as.double(as.matrix(full_log[, 4]))
pred = predict(cv.train, newx = x, s=cv.train$lambda.1se, type="response")
pred[pred > .05] = 1
pred[pred < .05] = 0
confusionMatrix(pred, y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 21857 201
## 1 370 352
##
## Accuracy : 0.9749
## 95% CI : (0.9728, 0.9769)
## No Information Rate : 0.9757
## P-Value [Acc > NIR] : 0.7878
##
## Kappa : 0.5395
## Mcnemar's Test P-Value : 0.000000000002057
##
## Sensitivity : 0.9834
## Specificity : 0.6365
## Pos Pred Value : 0.9909
## Neg Pred Value : 0.4875
## Prevalence : 0.9757
## Detection Rate : 0.9595
## Detection Prevalence : 0.9683
## Balanced Accuracy : 0.8099
##
## 'Positive' Class : 0
##
svm
full_svm = bind_rows(train.ml,test.ml) %>%
mutate(artifact = ifelse(artifact == 1, "yes","no"),
artifact = as.factor(artifact))
# re-run svm on full sample
full_pred = predict(svmFit, newdata = full_svm, type="prob") %>%
select(-no)
full_pred = as.matrix(full_pred)
full_pred[full_pred > .09] = "yes"
full_pred[full_pred < .09] = "no"
confusionMatrix(full_pred, full_svm$artifact)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 22101 216
## yes 126 337
##
## Accuracy : 0.985
## 95% CI : (0.9833, 0.9865)
## No Information Rate : 0.9757
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.6558
## Mcnemar's Test P-Value : 0.00000149
##
## Sensitivity : 0.9943
## Specificity : 0.6094
## Pos Pred Value : 0.9903
## Neg Pred Value : 0.7279
## Prevalence : 0.9757
## Detection Rate : 0.9702
## Detection Prevalence : 0.9797
## Balanced Accuracy : 0.8019
##
## 'Positive' Class : no
##
manual
full_man1 = full_man %>%
filter(tile == "tile_1")
confusionMatrix(full_man1$trash.combined, full_man1$artifact)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 22043 193
## 1 82 359
##
## Accuracy : 0.9879
## 95% CI : (0.9864, 0.9893)
## No Information Rate : 0.9757
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.7169
## Mcnemar's Test P-Value : 0.00000000003284
##
## Sensitivity : 0.9963
## Specificity : 0.6504
## Pos Pred Value : 0.9913
## Neg Pred Value : 0.8141
## Prevalence : 0.9757
## Detection Rate : 0.9720
## Detection Prevalence : 0.9806
## Balanced Accuracy : 0.8233
##
## 'Positive' Class : 0
##
plot and compare models
data.plot.log = bind_cols(full_log, as.data.frame(y), as.data.frame(pred)) %>%
mutate(hits = ifelse(y == 1 & `1` == 1, "hit",
ifelse(y == 0 & `1` == 1, "pos",
ifelse(y == 1 & `1` == 0, "neg", NA))),
label = ifelse(regexpr('.*', hits), as.character(volume), ''),
hits = as.factor(hits))
data.plot.svm = bind_cols(full_svm, as.data.frame(full_pred)) %>%
rename("full_pred" = yes) %>%
mutate(hits = ifelse(artifact == "yes" & full_pred == "yes", "hit",
ifelse(artifact == "no" & full_pred == "yes", "pos",
ifelse(artifact == "yes" & full_pred == "no", "neg", NA))),
label = ifelse(regexpr('.*', hits), as.character(volume), ''),
hits = as.factor(hits))
plot.comp = full_man %>%
filter(tile == "tile_1") %>%
select(subjectID, run, volume, euclidian_trans_deriv, hits) %>%
rename("auto" = hits) %>%
left_join(data.plot.log, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
select(subjectID, run, volume, euclidian_trans_deriv, auto, hits) %>%
rename("log" = hits) %>%
left_join(data.plot.svm, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
select(subjectID, run, volume, euclidian_trans_deriv, auto, log, hits) %>%
rename("svm" = hits) %>%
gather(model, hits, c("auto", "log", "svm")) %>%
mutate(label = ifelse(regexpr('.*', hits), as.character(volume), ''))
ggplot(plot.comp, aes(x = volume, y = euclidian_trans_deriv)) +
geom_line(size = .25) +
geom_point(data = subset(plot.comp, !is.na(hits)), aes(color = hits), size = 2.5) +
geom_text(aes(label = label), size = 1.5) +
facet_wrap(~ subjectID + run + model, ncol = 9, scales = "free") +
scale_color_manual(values = c("#3B9AB2","#EBCC2A","#F21A00")) +
theme(axis.text.x = element_text(size = 6))
